# Data pre-processing
import pandas as pd
import numpy as np
import missingno as msno
import warnings
warnings.filterwarnings("ignore")
#Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as pl
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
%matplotlib inline
# Data Clustering
import sklearn
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Load the two raw CSV exports and keep a working copy of each.
countryvaccination = pd.read_csv("2021-05-19_country_vaccinations.csv")
countryvaccinationbyMan = pd.read_csv(
    "2021-05-19_country_vaccinations_by_manufacturer.csv"
)
cv = countryvaccination.copy()
manuVac = countryvaccinationbyMan.copy()

# Date-indexed copy of the per-country data: parse the 'date' strings
# (YYYY-MM-DD) into timestamps and promote the column to the index.
vacc = countryvaccination.assign(
    date=pd.to_datetime(countryvaccination['date'], format='%Y-%m-%d')
).set_index('date')

# Missing-value overview: one missingno bar chart per dataset
# (manufacturer data first, then the date-indexed country data).
for dataset_frame in (manuVac, vacc):
    msno.bar(dataset_frame)
    plt.show()
# Collapse the per-date rows to one row per (country, iso_code, vaccines) by
# taking the maximum of every vaccination metric.
# The value columns are selected with a *list*: indexing a GroupBy with a bare
# tuple of column names was deprecated and raises in pandas >= 2.0.
vaccination_metrics = [
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations_per_million',
]
countryvaccination1 = (
    countryvaccination
    .groupby(["country", 'iso_code', 'vaccines'])[vaccination_metrics]
    .max()
    .reset_index()
)
# Same reduction for the manufacturer data: maximum total per (location, vaccine).
countryvaccinationbyMan1 = (
    countryvaccinationbyMan
    .groupby(["location", 'vaccine'])['total_vaccinations']
    .max()
    .reset_index()
)
countryvaccination1
| country | iso_code | vaccines | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | 504502.0 | 448878.0 | 55624.0 | 13921.0 | 1.30 | 1.15 | 0.14 | 358.0 |
| 1 | Albania | ALB | Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, ... | 632676.0 | 444755.0 | 187921.0 | 17565.0 | 21.98 | 15.45 | 6.53 | 6104.0 |
| 2 | Algeria | DZA | Sputnik V | 75000.0 | NaN | NaN | 3748.0 | 0.17 | NaN | NaN | 85.0 |
| 3 | Andorra | AND | Oxford/AstraZeneca, Pfizer/BioNTech | 31633.0 | 26931.0 | 4702.0 | 1182.0 | 40.94 | 34.86 | 6.09 | 15298.0 |
| 4 | Angola | AGO | Oxford/AstraZeneca | 626572.0 | 586377.0 | 40195.0 | 18751.0 | 1.91 | 1.78 | 0.12 | 571.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 206 | Wales | OWID_WLS | Moderna, Oxford/AstraZeneca, Pfizer/BioNTech | 2853576.0 | 1975820.0 | 877756.0 | 33151.0 | 90.51 | 62.67 | 27.84 | 10515.0 |
| 207 | Wallis and Futuna | WLF | Oxford/AstraZeneca | 5774.0 | 4286.0 | 1488.0 | 272.0 | 51.34 | 38.11 | 13.23 | 24186.0 |
| 208 | Yemen | YEM | Oxford/AstraZeneca | 18555.0 | 18555.0 | NaN | NaN | 0.06 | 0.06 | NaN | NaN |
| 209 | Zambia | ZMB | Oxford/AstraZeneca | 90916.0 | 90916.0 | NaN | 5680.0 | 0.49 | 0.49 | NaN | 309.0 |
| 210 | Zimbabwe | ZWE | Sinopharm/Beijing | 730365.0 | 549797.0 | 180568.0 | 22863.0 | 4.91 | 3.70 | 1.21 | 1538.0 |
211 rows × 11 columns
# World choropleth of the per-country total vaccination count; the colour bar
# encodes the total number of vaccinations.
# Observation from the plot: China, the United States and India have the
# highest totals. Raw totals ignore population size, which also has to be
# taken into account when judging vaccination progress.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

trace = go.Choropleth(
    locations=countryvaccination1['country'],
    locationmode='country names',
    z=countryvaccination1['total_vaccinations'],
    text=countryvaccination1['country'],
    autocolorscale=False,
    reversescale=True,
    colorscale='viridis',
    # Thin black outline around every country.
    marker={'line': {'color': 'rgb(0,0,0)', 'width': 0.5}},
    colorbar={'title': 'Total vaccinations', 'tickprefix': ''},
)
layout = go.Layout(
    title='Total vaccinations for country',
    geo={
        'showframe': True,
        'showlakes': False,
        'showcoastlines': True,
        'projection': {'type': 'natural earth'},
    },
)
data = [trace]
fig = {'data': data, 'layout': layout}
iplot(fig)
# Treemap with the vaccine combination as the outer level and the countries
# using it nested inside; tile size is the country's total vaccination count.
# It shows which vaccines the top vaccinating countries use.
fig = px.treemap(
    countryvaccination1,
    path=['vaccines', 'country'],
    values='total_vaccinations',
    title="Total Vaccinations for a country",
)
fig.show()
# Area chart of total vaccinations over time (December 2020 to May 2021) for
# the ten countries with the largest total on the latest date in the data.
# The selected countries appear in the plot legend.
latest_date = countryvaccination['date'].max()
# Only rows reported on the latest date take part in the top-10 ranking.
latest_rows = countryvaccination[countryvaccination['date'] == latest_date]
top10_countries = (
    latest_rows
    .sort_values("total_vaccinations", ascending=False)
    .iloc[:10]
    .country
)
top10_history = (
    countryvaccination[countryvaccination.country.isin(top10_countries)]
    .sort_values("total_vaccinations", ascending=False)
)
fig = px.area(
    top10_history,
    x="date",
    y="total_vaccinations",
    color="country",
    template="plotly_dark",
)
fig.update_traces(line={"width": 1.25})
fig.update_layout(
    title=f"Top 10 Countries with Most Vaccinated on {latest_date}",
    xaxis={"title": "date"},
    yaxis={"title": "total vaccinations"},
)
# Horizontal bar chart of total vaccinations per hundred people for the top 20
# countries. It highlights the fastest vaccinating countries; small
# populations such as Gibraltar, Seychelles and the Falkland Islands have
# vaccinated almost everyone.
x = (
    countryvaccination1
    .groupby("country")["total_vaccinations_per_hundred"]
    .max()
    .sort_values(ascending=False)
    .head(20)
)
plt.figure(figsize=(15, 10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=x.values, y=x.index)
# The plotted metric is the per-hundred figure, so the axis label says so.
ax.set_xlabel("total vaccinations per hundred")
ax.set_ylabel("Country")
plt.show()
# Load the Worldometer 2020 population table and expose the country name
# under a 'country' column so it can be joined with the vaccination data.
pop = pd.read_csv("population_by_country_2020.csv")
pop = pop.assign(country=pop['Country (or dependency)'])
# Keep only the join key and the population count.
pop1 = pop.loc[:, ['country', 'Population (2020)']]
# Display only: the re-indexed frame is not assigned back.
pop1.reset_index(drop=True)
| country | Population (2020) | |
|---|---|---|
| 0 | China | 1440297825 |
| 1 | India | 1382345085 |
| 2 | United States | 331341050 |
| 3 | Indonesia | 274021604 |
| 4 | Pakistan | 221612785 |
| ... | ... | ... |
| 230 | Montserrat | 4993 |
| 231 | Falkland Islands | 3497 |
| 232 | Niue | 1628 |
| 233 | Tokelau | 1360 |
| 234 | Holy See | 801 |
235 rows × 2 columns
# Attach the population count to the aggregated vaccination frame.
# `cc` keeps the pre-merge frame; the merge keeps only countries whose name
# appears in both tables (default inner join on 'country').
cc = countryvaccination1.copy()
countryvaccination1 = pd.merge(countryvaccination1, pop1, on='country')

# Vaccination rates relative to the 2020 population; missing counts are
# treated as 0 before dividing.
population_2020 = countryvaccination1['Population (2020)']
countryvaccination1['percentage_fully_vaccinated'] = (
    countryvaccination1['people_fully_vaccinated'].fillna(0).mul(100).div(population_2020)
)
# NOTE(review): this uses total_vaccinations, not people_vaccinated, so the
# value is a doses-per-population figure - confirm this is intended.
countryvaccination1['percentage_vaccinated'] = (
    countryvaccination1['total_vaccinations'].fillna(0).div(population_2020).mul(100)
)

# Bubble map of the rate per country: both marker colour and marker size
# encode 'percentage_vaccinated'.
# Observation from the plot: Europe has the highest share relative to population.
fig = px.scatter_geo(
    countryvaccination1,
    locations="country",
    locationmode='country names',
    color="percentage_vaccinated",
    size="percentage_vaccinated",
    hover_name="country",
    projection="natural earth",
)
fig.update_layout(
    title='The number of vaccinated people per population',
    title_x=0.45,
)
fig.show()
# Bar chart of the average daily vaccination count for the ten countries with
# the highest average.
# Observation from the plot: China, the USA and India have the highest daily
# vaccination numbers.
daily = (
    countryvaccination
    .groupby("country")
    .daily_vaccinations
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
plt.figure(figsize=(15, 10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=daily.values, y=daily.index)
ax.set_title("Daily Vaccinations Count")
ax.set_xlabel("Daily Vaccinations(avg)")
ax.set_ylabel("Country")
plt.show()
# One dataframe per top vaccinating country.
China = countryvaccination[countryvaccination['country'] == 'China']
UnitedStates = countryvaccination[countryvaccination['country'] == 'United States']
India = countryvaccination[countryvaccination['country'] == 'India']
Germany = countryvaccination[countryvaccination['country'] == 'Germany']
UK = countryvaccination[countryvaccination['country'] == 'United Kingdom']

# Round-trip China through CSV so that 'date' is parsed and used as the index.
China.to_csv("China.csv")
China1 = pd.read_csv('China.csv', index_col='date', parse_dates=True)
# Fill gaps in the running total from the previous report, then back-fill any
# leading gap. .ffill()/.bfill() replace the deprecated fillna(method=...).
China1['total_vaccinations'] = China1['total_vaccinations'].ffill().bfill()
# Days without a daily figure are counted as zero vaccinations.
China1['daily_vaccinations'] = China1['daily_vaccinations'].fillna(0)
China2 = China1[['daily_vaccinations']]
China3 = China1[['total_vaccinations']]
China1['daily_vaccinations'].plot(figsize=(12, 6))
<AxesSubplot:xlabel='date'>
China2 = China2.dropna()
# Plot the day-over-day change of China's daily vaccinations (China2 holds the
# 'daily_vaccinations' column).
# Observation: no trend or seasonality is visible that a forecast could use.
china_daily_change = China2.diff()
plt.figure(figsize=(10, 6))
plt.plot(china_daily_change)
[<matplotlib.lines.Line2D at 0x1e359c31d60>]
# Plot the day-over-day change of China's total vaccinations.
# Observation: no trend or seasonality is visible that a forecast could use,
# so China is left out of the forecasting.
china_total_change = China3.diff()
plt.figure(figsize=(10, 6))
plt.plot(china_total_change)
[<matplotlib.lines.Line2D at 0x1e359c9cdc0>]
# Round-trip the United States rows through CSV so that 'date' is parsed and
# used as the index, then pre-process the two series of interest.
UnitedStates.to_csv("UnitedStates.csv")
UnitedStates1 = pd.read_csv('UnitedStates.csv', index_col='date', parse_dates=True)
# Days without a daily figure are counted as zero vaccinations.
UnitedStates1['daily_vaccinations'] = UnitedStates1['daily_vaccinations'].fillna(0)
# Carry the last reported running total forward over gaps.
# .ffill() replaces the deprecated fillna(method='ffill').
UnitedStates1['total_vaccinations'] = UnitedStates1['total_vaccinations'].ffill()
UnitedStates1.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 145 entries, 2020-12-20 to 2021-05-13 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 145 non-null int64 1 country 145 non-null object 2 iso_code 145 non-null object 3 total_vaccinations 145 non-null float64 4 people_vaccinated 130 non-null float64 5 people_fully_vaccinated 116 non-null float64 6 daily_vaccinations_raw 121 non-null float64 7 daily_vaccinations 145 non-null float64 8 total_vaccinations_per_hundred 131 non-null float64 9 people_vaccinated_per_hundred 130 non-null float64 10 people_fully_vaccinated_per_hundred 116 non-null float64 11 daily_vaccinations_per_million 144 non-null float64 12 vaccines 145 non-null object 13 source_name 145 non-null object 14 source_website 145 non-null object dtypes: float64(9), int64(1), object(5) memory usage: 18.1+ KB
UnitedStates2 = UnitedStates1.loc[:, ['total_vaccinations']]
# Plot the day-over-day change of the US total vaccinations.
# Observation: the plot shows a seasonal trend that can be analysed and used
# for prediction.
us_total_change = UnitedStates2.diff()
plt.figure(figsize=(10, 6))
plt.plot(us_total_change)
[<matplotlib.lines.Line2D at 0x1e35a48bb50>]
# Work on the day-over-day change of the total; the first row has no
# predecessor and is dropped.
UnitedStates2 = UnitedStates2.diff().dropna()

# Chronological split: everything before start_date is training data, the
# rest is held out for comparison with the prediction.
start_date = '2021-04-12'
split_point = pd.to_datetime(start_date)
train = UnitedStates2.loc[UnitedStates2.index < split_point]
test = UnitedStates2.loc[UnitedStates2.index >= split_point]

# Train the SARIMAX model for the time-series forecast.
# The model is fit on `train` only: fitting on the whole series (as before)
# made the "prediction" from start_date onwards an in-sample fit, so it could
# not be compared fairly against the held-out values.
from statsmodels.tsa.statespace.sarimax import SARIMAX
model = SARIMAX(train, order=(5, 1, 4))
results = model.fit(disp=True)
c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. c:\python\lib\site-packages\statsmodels\base\model.py:567: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
# Predict from start_date up to 2021-05-13 and draw the prediction (orange
# line) on top of the observed values (blue line).
sarimax_prediction = results.predict(
    start=start_date,
    end='2021-05-13',
    dynamic=False,
)
plt.figure(figsize=(10, 5))
# plt.plot returns a list of lines; each call here draws exactly one.
l1 = plt.plot(UnitedStates2, label='Observation')[0]
l2 = plt.plot(sarimax_prediction, label='ARIMA')[0]
plt.legend(handles=[l1, l2])
# Show plain numbers on the y axis instead of scientific notation.
plt.ticklabel_format(style="plain", axis='y')
plt.savefig('SARIMAX prediction', bbox_inches='tight', transparent=False)
# Run pmdarima's auto_arima on the US series to search for the model order;
# trace=True prints every candidate that is tried.
import pmdarima as pm
stepwise = pm.auto_arima(
    UnitedStates2,
    suppress_warnings=True,
    trace=True,
)
stepwise.summary()
Performing stepwise search to minimize aic ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=4258.520, Time=0.56 sec ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=4302.524, Time=0.01 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=4293.495, Time=0.02 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=4269.341, Time=0.02 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=4300.564, Time=0.02 sec ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=4267.529, Time=0.06 sec ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=4262.775, Time=0.09 sec ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=4229.788, Time=0.20 sec ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=4251.779, Time=0.10 sec ARIMA(4,1,2)(0,0,0)[0] intercept : AIC=4217.919, Time=0.32 sec ARIMA(4,1,1)(0,0,0)[0] intercept : AIC=4236.414, Time=0.10 sec ARIMA(5,1,2)(0,0,0)[0] intercept : AIC=4209.518, Time=0.70 sec ARIMA(5,1,1)(0,0,0)[0] intercept : AIC=4220.284, Time=0.19 sec ARIMA(5,1,3)(0,0,0)[0] intercept : AIC=4208.658, Time=0.78 sec ARIMA(4,1,3)(0,0,0)[0] intercept : AIC=4215.307, Time=1.06 sec ARIMA(5,1,4)(0,0,0)[0] intercept : AIC=4208.482, Time=1.74 sec ARIMA(4,1,4)(0,0,0)[0] intercept : AIC=4213.159, Time=0.81 sec ARIMA(5,1,5)(0,0,0)[0] intercept : AIC=4212.247, Time=0.77 sec ARIMA(4,1,5)(0,0,0)[0] intercept : AIC=4216.972, Time=0.89 sec ARIMA(5,1,4)(0,0,0)[0] : AIC=4214.991, Time=1.52 sec Best model: ARIMA(5,1,4)(0,0,0)[0] intercept Total fit time: 9.935 seconds
| Dep. Variable: | y | No. Observations: | 144 |
|---|---|---|---|
| Model: | SARIMAX(5, 1, 4) | Log Likelihood | -2093.241 |
| Date: | Tue, 01 Jun 2021 | AIC | 4208.482 |
| Time: | 02:27:51 | BIC | 4241.073 |
| Sample: | 0 | HQIC | 4221.725 |
| - 144 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 5.345e+04 | 2.82e+04 | 1.897 | 0.058 | -1783.588 | 1.09e+05 |
| ar.L1 | 0.2004 | 0.510 | 0.393 | 0.694 | -0.798 | 1.199 |
| ar.L2 | -0.2972 | 0.473 | -0.629 | 0.530 | -1.224 | 0.629 |
| ar.L3 | -0.5558 | 0.356 | -1.560 | 0.119 | -1.254 | 0.142 |
| ar.L4 | -0.2428 | 0.154 | -1.575 | 0.115 | -0.545 | 0.059 |
| ar.L5 | -0.2708 | 0.138 | -1.962 | 0.050 | -0.541 | -0.000 |
| ma.L1 | -1.0979 | 0.517 | -2.125 | 0.034 | -2.110 | -0.085 |
| ma.L2 | 0.4795 | 0.923 | 0.520 | 0.603 | -1.329 | 2.288 |
| ma.L3 | 0.3094 | 0.822 | 0.376 | 0.707 | -1.302 | 1.921 |
| ma.L4 | -0.2551 | 0.322 | -0.793 | 0.428 | -0.886 | 0.376 |
| sigma2 | 3.097e+11 | 0.005 | 6.82e+13 | 0.000 | 3.1e+11 | 3.1e+11 |
| Ljung-Box (Q): | 22.81 | Jarque-Bera (JB): | 136.99 |
|---|---|---|---|
| Prob(Q): | 0.99 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.42 | Skew: | 0.99 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 7.36 |
# Augmented Dickey-Fuller test to check a series for stationarity.
from statsmodels.tsa.stattools import adfuller


def ad_test(dataset):
    """Run the Augmented Dickey-Fuller test on *dataset* and print the result.

    The lag length is chosen automatically with the AIC criterion. Printed
    are the test statistic, the p-value, the number of lags used, the number
    of observations used and the critical values (as a dict and one per line).

    Args:
        dataset: The time series to test; passed straight to ``adfuller``.
    """
    dftest = adfuller(dataset, autolag='AIC')
    print("1. ADF: ", dftest[0])
    print("2. P-Value: ", dftest[1])
    print("3. Num of Lags: ", dftest[2])
    # adfuller returns (statistic, p-value, used lags, nobs, critical values, ...):
    # the observation count is element 3. Element 2 (the lag count) was
    # printed here a second time by mistake.
    print("4. Number of observations used for ADF Regression and critival values calculation: ", dftest[3])
    print("5. Critical Values: ", dftest[4])
    for key, val in dftest[4].items():
        print("\t", key, ":", val)


ad_test(UnitedStates2)
1. ADF: -1.9020246173694153
2. P-Value: 0.33110926706826116
3. Num of Lags: 9
4. Number of observations used for ADF Regression and critival values calculation: 9
5. Critical Values: {'1%': -3.480118600110386, '5%': -2.8833618426136196, '10%': -2.578407034974382}
1% : -3.480118600110386
5% : -2.8833618426136196
10% : -2.578407034974382
# Show the last row of the series, i.e. the final date covered by the data.
UnitedStates2.iloc[-1:]
| total_vaccinations | |
|---|---|
| date | |
| 2021-05-13 | 1915642.0 |
# Fit SARIMAX(5, 1, 4) on the series and forecast beyond its end, for the
# dates May 14th to June 13th.
forecast_model = SARIMAX(UnitedStates2, order=(5, 1, 4))
model2 = forecast_model.fit(disp=True)
# model2.summary() can be called here to inspect the fit.

# Both the date range and the step range are inclusive, so they cover 31
# entries each.
index_feature_dates = pd.date_range(start='2021-05-14', end='2021-06-13')
n_observed = len(UnitedStates2)
pred = model2.predict(start=n_observed, end=n_observed + 30)
pred = pred.rename('total_vaccinations')
# Label the forecast steps with their calendar dates.
pred.index = index_feature_dates
print(pred)
c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
2021-05-14 2.560729e+06 2021-05-15 2.687125e+06 2021-05-16 2.391035e+06 2021-05-17 1.882785e+06 2021-05-18 1.593458e+06 2021-05-19 1.729720e+06 2021-05-20 2.182042e+06 2021-05-21 2.599084e+06 2021-05-22 2.666382e+06 2021-05-23 2.334943e+06 2021-05-24 1.852494e+06 2021-05-25 1.573583e+06 2021-05-26 1.697578e+06 2021-05-27 2.126618e+06 2021-05-28 2.544150e+06 2021-05-29 2.650234e+06 2021-05-30 2.378217e+06 2021-05-31 1.936060e+06 2021-06-01 1.647689e+06 2021-06-02 1.715392e+06 2021-06-03 2.077716e+06 2021-06-04 2.462621e+06 2021-06-05 2.591133e+06 2021-06-06 2.379247e+06 2021-06-07 1.992097e+06 2021-06-08 1.716060e+06 2021-06-09 1.747395e+06 2021-06-10 2.053168e+06 2021-06-11 2.402725e+06 2021-06-12 2.541308e+06 2021-06-13 2.375372e+06 Freq: D, Name: total_vaccinations, dtype: float64
c:\python\lib\site-packages\statsmodels\base\model.py:567: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
# Append the predicted values to the series of total-vaccination differences
# and plot the combined frame.
# NOTE: UnitedStates2 is overwritten and from here on contains the predicted
# rows as well as the observed ones.
preditions = pred.to_frame()
frames = [UnitedStates2, preditions]
UnitedStates2 = pd.concat(frames)
UnitedStates2.plot()
<AxesSubplot:>
# Sum of all predicted values (NaN entries, if any, are skipped).
sum1 = pred.sum(skipna=True)
sum1
67088160.22308184
# One summary row for the United States: the maximum of every vaccination
# metric per (country, iso_code, vaccines).
# The value columns are selected with a *list*: indexing a GroupBy with a bare
# tuple of column names was deprecated and raises in pandas >= 2.0.
us_metrics = [
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations_per_million',
]
UnitedStates3 = (
    UnitedStates1
    .groupby(["country", 'iso_code', 'vaccines'])[us_metrics]
    .max()
    .reset_index()
)
UnitedStates3
| country | iso_code | vaccines | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | United States | USA | Johnson&Johnson, Moderna, Pfizer/BioNTech | 266596486.0 | 154624231.0 | 118987308.0 | 3384387.0 | 79.71 | 46.23 | 35.58 | 10120.0 |
# Projected total: the current maximum total (as an integer) plus the sum of
# the predicted values.
us_current_total = UnitedStates3['total_vaccinations'].astype(int)
sum2 = us_current_total + sum1
sum2
0 3.336846e+08 Name: total_vaccinations, dtype: float64
# Store the summed prediction and the projected total (truncated to an
# integer) next to the current figures.
UnitedStates3['sum_daily_vaccinations_30days'] = sum1
UnitedStates3['total_vaccinations_after30days'] = sum2.astype(int)
UnitedStates3
| country | iso_code | vaccines | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | sum_daily_vaccinations_30days | total_vaccinations_after30days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | United States | USA | Johnson&Johnson, Moderna, Pfizer/BioNTech | 266596486.0 | 154624231.0 | 118987308.0 | 3384387.0 | 79.71 | 46.23 | 35.58 | 10120.0 | 6.708816e+07 | 333684646 |
# 'Vaccination rate' is (1 - current total / projected total) * 100, i.e. the
# share of the projected total that comes from the predicted values.
# NOTE(review): this is not the same as the growth relative to the current
# total - confirm which of the two is intended.
current_share = (
    UnitedStates3['total_vaccinations']
    / UnitedStates3['total_vaccinations_after30days']
)
UnitedStates3['Vaccination rate'] = (1 - current_share) * 100

# Grouped bars: the blue bar is the total vaccination count before the
# prediction, the orange bar the total after adding the predicted values.
UnitedStates3.plot(
    kind="bar",
    x="vaccines",
    y=["total_vaccinations", "total_vaccinations_after30days"],
)
# Show plain numbers on the y axis instead of scientific notation.
plt.ticklabel_format(style="plain", axis='y')
# A full turn: the tick labels end up horizontal.
plt.xticks(rotation=360)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x1e35f0c9910>
# Bar of the current total and the summed prediction for the United States;
# the 'Vaccination rate' percentage is shown on hover. In the original run it
# was about 20% after a month.
fig = px.bar(
    UnitedStates3,
    x='country',
    y=["total_vaccinations", "sum_daily_vaccinations_30days"],
    color='vaccines',
    hover_data=['Vaccination rate'],
    title='Vaccination rate increase pred',
    height=400,
)
fig.show()
# Round-trip the United Kingdom rows through CSV so that 'date' is parsed and
# used as the index, then pre-process the two series of interest.
UK.to_csv("UK.csv")
UK1 = pd.read_csv('UK.csv', index_col='date', parse_dates=True)
# Days without a daily figure are counted as zero vaccinations.
UK1['daily_vaccinations'] = UK1['daily_vaccinations'].fillna(0)
# Carry the last reported running total forward over gaps.
# .ffill() replaces the deprecated fillna(method='ffill').
UK1['total_vaccinations'] = UK1['total_vaccinations'].ffill()
UK2 = UK1[['total_vaccinations']]
# Plot the day-over-day change of the UK total vaccinations.
plt.figure(figsize=(10, 6))
plt.plot(UK2.diff())
[<matplotlib.lines.Line2D at 0x1e346cd4fd0>]
# Work on the day-over-day change of the total; the first row has no
# predecessor and is set to 0.
UK2 = UK2.diff().fillna(0)

# Chronological split: everything before start_date is training data, the
# rest is held out for comparison with the prediction.
start_date = '2021-04-12'
split_point = pd.to_datetime(start_date)
train2 = UK2.loc[UK2.index < split_point]
test2 = UK2.loc[UK2.index >= split_point]

# The model is fit on `train2` only: fitting on the whole series (as before)
# made the "prediction" from start_date onwards an in-sample fit, so it could
# not be compared fairly against the held-out values.
from statsmodels.tsa.statespace.sarimax import SARIMAX
model4 = SARIMAX(train2, order=(1, 1, 1))
results2 = model4.fit(disp=True)
c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
# Predict from start_date up to 2021-05-13 and draw the prediction on top of
# the observed UK values.
sarimax_prediction2 = results2.predict(
    start=start_date, end='2021-05-13', dynamic=False)
plt.figure(figsize=(10, 5))
l1, = plt.plot(UK2, label='Observation')
l2, = plt.plot(sarimax_prediction2, label='ARIMA')
plt.legend(handles=[l1, l2])
# Show plain numbers on the y axis instead of scientific notation.
plt.ticklabel_format(style="plain", axis='y')
# Country-specific file name: the shared name 'SARIMAX prediction' made each
# country's figure overwrite the previous one on disk.
plt.savefig('SARIMAX prediction UK', bbox_inches='tight', transparent=False)
# Run pmdarima's auto_arima on the UK series to search for the model order;
# trace=True prints every candidate that is tried.
import pmdarima as pm
stepwise = pm.auto_arima(
    UK2,
    suppress_warnings=True,
    trace=True,
)
stepwise.summary()
Performing stepwise search to minimize aic ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=inf, Time=0.33 sec ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=3512.991, Time=0.02 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=3504.356, Time=0.02 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=3488.613, Time=0.02 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=3511.078, Time=0.00 sec ARIMA(1,1,1)(0,0,0)[0] intercept : AIC=3484.388, Time=0.05 sec ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=3484.574, Time=0.23 sec ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=3530.320, Time=0.16 sec ARIMA(0,1,2)(0,0,0)[0] intercept : AIC=3522.697, Time=0.11 sec ARIMA(2,1,0)(0,0,0)[0] intercept : AIC=3502.052, Time=0.04 sec ARIMA(1,1,1)(0,0,0)[0] : AIC=3478.238, Time=0.10 sec ARIMA(0,1,1)(0,0,0)[0] : AIC=3486.621, Time=0.05 sec ARIMA(1,1,0)(0,0,0)[0] : AIC=3502.488, Time=0.02 sec ARIMA(2,1,1)(0,0,0)[0] : AIC=3479.727, Time=0.21 sec ARIMA(1,1,2)(0,0,0)[0] : AIC=3515.292, Time=0.16 sec ARIMA(0,1,2)(0,0,0)[0] : AIC=3509.306, Time=0.09 sec ARIMA(2,1,0)(0,0,0)[0] : AIC=3500.222, Time=0.05 sec ARIMA(2,1,2)(0,0,0)[0] : AIC=inf, Time=0.64 sec Best model: ARIMA(1,1,1)(0,0,0)[0] Total fit time: 2.282 seconds
| Dep. Variable: | y | No. Observations: | 130 |
|---|---|---|---|
| Model: | SARIMAX(1, 1, 1) | Log Likelihood | -1736.119 |
| Date: | Tue, 01 Jun 2021 | AIC | 3478.238 |
| Time: | 02:27:57 | BIC | 3486.817 |
| Sample: | 0 | HQIC | 3481.724 |
| - 130 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| ar.L1 | 0.3577 | 0.072 | 4.958 | 0.000 | 0.216 | 0.499 |
| ma.L1 | -0.9218 | 0.055 | -16.857 | 0.000 | -1.029 | -0.815 |
| sigma2 | 3.368e+10 | 2.03e-12 | 1.66e+22 | 0.000 | 3.37e+10 | 3.37e+10 |
| Ljung-Box (Q): | 90.49 | Jarque-Bera (JB): | 2636.20 |
|---|---|---|---|
| Prob(Q): | 0.00 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 0.38 | Skew: | 2.62 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 24.52 |
# Round-trip the Germany rows through CSV so that 'date' is parsed and used
# as the index, then pre-process the two series of interest.
Germany.to_csv("Germany.csv")
Germany1 = pd.read_csv('Germany.csv', index_col='date', parse_dates=True)
# Days without a daily figure are counted as zero vaccinations.
Germany1['daily_vaccinations'] = Germany1['daily_vaccinations'].fillna(0)
# Carry the last reported running total forward over gaps.
# .ffill() replaces the deprecated fillna(method='ffill').
Germany1['total_vaccinations'] = Germany1['total_vaccinations'].ffill()
Germany2 = Germany1[['total_vaccinations']]
# Plot the day-over-day change of the German total vaccinations.
plt.figure(figsize=(10, 6))
plt.plot(Germany2.diff())
[<matplotlib.lines.Line2D at 0x1e3600be9d0>]
# Work on the day-over-day change of the total; the first row has no
# predecessor and is set to 0.
Germany2 = Germany2.diff().fillna(0)

# Chronological split: everything before start_date is training data, the
# rest is held out for comparison with the prediction.
start_date = '2021-04-12'
split_point = pd.to_datetime(start_date)
train1 = Germany2.loc[Germany2.index < split_point]
test1 = Germany2.loc[Germany2.index >= split_point]

# The model is fit on `train1` only: fitting on the whole series (as before)
# made the "prediction" from start_date onwards an in-sample fit, so it could
# not be compared fairly against the held-out values.
from statsmodels.tsa.statespace.sarimax import SARIMAX
model1 = SARIMAX(train1, order=(4, 1, 3))
results1 = model1.fit(disp=True)
c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. c:\python\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. c:\python\lib\site-packages\statsmodels\base\model.py:567: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
# Predict from start_date up to 2021-05-13 and draw the prediction on top of
# the observed German values.
sarimax_prediction1 = results1.predict(
    start=start_date, end='2021-05-13', dynamic=False)
plt.figure(figsize=(10, 5))
l1, = plt.plot(Germany2, label='Observation')
l2, = plt.plot(sarimax_prediction1, label='ARIMA')
plt.legend(handles=[l1, l2])
# Show plain numbers on the y axis instead of scientific notation.
plt.ticklabel_format(style="plain", axis='y')
# Country-specific file name: the shared name 'SARIMAX prediction' made each
# country's figure overwrite the previous one on disk.
plt.savefig('SARIMAX prediction Germany', bbox_inches='tight', transparent=False)
# Run pmdarima's auto_arima on the Germany series to search for the model
# order; trace=True prints every candidate that is tried.
import pmdarima as pm
stepwise = pm.auto_arima(
    Germany2,
    suppress_warnings=True,
    trace=True,
)
stepwise.summary()
Performing stepwise search to minimize aic ARIMA(2,1,2)(0,0,0)[0] intercept : AIC=3547.874, Time=0.18 sec ARIMA(0,1,0)(0,0,0)[0] intercept : AIC=3655.540, Time=0.00 sec ARIMA(1,1,0)(0,0,0)[0] intercept : AIC=3647.933, Time=0.02 sec ARIMA(0,1,1)(0,0,0)[0] intercept : AIC=3641.569, Time=0.04 sec ARIMA(0,1,0)(0,0,0)[0] : AIC=3653.599, Time=0.01 sec ARIMA(1,1,2)(0,0,0)[0] intercept : AIC=3622.635, Time=0.05 sec ARIMA(2,1,1)(0,0,0)[0] intercept : AIC=3548.791, Time=0.11 sec ARIMA(3,1,2)(0,0,0)[0] intercept : AIC=3544.223, Time=0.18 sec ARIMA(3,1,1)(0,0,0)[0] intercept : AIC=3535.895, Time=0.18 sec ARIMA(3,1,0)(0,0,0)[0] intercept : AIC=3554.664, Time=0.10 sec ARIMA(4,1,1)(0,0,0)[0] intercept : AIC=3527.844, Time=0.32 sec ARIMA(4,1,0)(0,0,0)[0] intercept : AIC=inf, Time=0.14 sec ARIMA(5,1,1)(0,0,0)[0] intercept : AIC=3529.759, Time=0.75 sec ARIMA(4,1,2)(0,0,0)[0] intercept : AIC=3525.164, Time=0.40 sec ARIMA(5,1,2)(0,0,0)[0] intercept : AIC=3526.371, Time=0.93 sec ARIMA(4,1,3)(0,0,0)[0] intercept : AIC=3499.934, Time=1.11 sec ARIMA(3,1,3)(0,0,0)[0] intercept : AIC=3539.343, Time=0.66 sec ARIMA(5,1,3)(0,0,0)[0] intercept : AIC=3501.807, Time=0.67 sec ARIMA(4,1,4)(0,0,0)[0] intercept : AIC=3505.564, Time=0.65 sec ARIMA(3,1,4)(0,0,0)[0] intercept : AIC=3536.660, Time=0.37 sec ARIMA(5,1,4)(0,0,0)[0] intercept : AIC=3506.528, Time=0.98 sec ARIMA(4,1,3)(0,0,0)[0] : AIC=3502.516, Time=1.21 sec Best model: ARIMA(4,1,3)(0,0,0)[0] intercept Total fit time: 9.057 seconds
| Dep. Variable: | y | No. Observations: | 138 |
|---|---|---|---|
| Model: | SARIMAX(4, 1, 3) | Log Likelihood | -1740.967 |
| Date: | Tue, 01 Jun 2021 | AIC | 3499.934 |
| Time: | 02:28:08 | BIC | 3526.214 |
| Sample: | 0 | HQIC | 3510.613 |
| - 138 | |||
| Covariance Type: | opg |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| intercept | 1.272e+04 | 9754.945 | 1.304 | 0.192 | -6400.124 | 3.18e+04 |
| ar.L1 | -0.7155 | 0.066 | -10.889 | 0.000 | -0.844 | -0.587 |
| ar.L2 | -0.0087 | 0.055 | -0.158 | 0.874 | -0.116 | 0.099 |
| ar.L3 | -0.6823 | 0.058 | -11.688 | 0.000 | -0.797 | -0.568 |
| ar.L4 | -0.9627 | 0.044 | -21.755 | 0.000 | -1.049 | -0.876 |
| ma.L1 | 0.9367 | 0.141 | 6.655 | 0.000 | 0.661 | 1.213 |
| ma.L2 | -0.4434 | 0.149 | -2.973 | 0.003 | -0.736 | -0.151 |
| ma.L3 | -0.6169 | 0.138 | -4.477 | 0.000 | -0.887 | -0.347 |
| sigma2 | 5.967e+09 | 0.035 | 1.72e+11 | 0.000 | 5.97e+09 | 5.97e+09 |
| Ljung-Box (Q): | 35.70 | Jarque-Bera (JB): | 2618.35 |
|---|---|---|---|
| Prob(Q): | 0.66 | Prob(JB): | 0.00 |
| Heteroskedasticity (H): | 22.72 | Skew: | -2.41 |
| Prob(H) (two-sided): | 0.00 | Kurtosis: | 23.87 |
# World map of vaccine usage: each country is coloured by the vaccine
# combination it uses.
# For example, Germany uses Johnson & Johnson, Moderna, Pfizer and AstraZeneca,
# shown in light green; France and Spain use the same combination.
fig = px.choropleth(
    countryvaccination1,
    locations="iso_code",
    hover_name="country",
    color="vaccines",
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Vaccines used by different countries",
)
# The legend is hidden.
fig.update_layout(showlegend=False)
fig.show()
# Count how many countries use each vaccine combination and show the ten most
# common ones. Observation: Oxford/AstraZeneca on its own is used by the most
# countries.
vaccine = (
    countryvaccination1["vaccines"]
    .value_counts()
    .rename_axis('popular Vaccines')
    .reset_index(name='Number of Countries')
)
vaccine.nlargest(10, 'Number of Countries')
| popular Vaccines | Number of Countries | |
|---|---|---|
| 0 | Oxford/AstraZeneca | 52 |
| 1 | Johnson&Johnson, Moderna, Oxford/AstraZeneca, ... | 13 |
| 2 | Moderna, Oxford/AstraZeneca, Pfizer/BioNTech | 12 |
| 3 | Oxford/AstraZeneca, Pfizer/BioNTech | 11 |
| 4 | Pfizer/BioNTech | 10 |
| 5 | Oxford/AstraZeneca, Sinopharm/Beijing | 9 |
| 6 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac | 8 |
| 7 | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | 8 |
| 8 | Sputnik V | 8 |
| 9 | Moderna, Pfizer/BioNTech | 6 |
# Count in how many locations of the manufacturer dataset each vaccine
# appears and show the ten most common ones.
vaccine1 = (
    countryvaccinationbyMan1["vaccine"]
    .value_counts()
    .rename_axis('popular Vaccines')
    .reset_index(name='Number of Countries')
)
vaccine1.nlargest(10, 'Number of Countries')
| popular Vaccines | Number of Countries | |
|---|---|---|
| 0 | Pfizer/BioNTech | 10 |
| 1 | Johnson&Johnson | 9 |
| 2 | Moderna | 9 |
| 3 | Oxford/AstraZeneca | 9 |
| 4 | Sinovac | 1 |
# Bar chart per vaccine combination. For every combination the maximum of
# each selected column is taken over all rows, so the bar height is the
# largest single total_vaccinations value reported with that combination.
# Observation from the original run: the Sinopharm/Beijing, Sinopharm/Wuhan,
# Sinovac combination has the highest value, while Pfizer/BioNTech and
# Sputnik V on their own are among the lowest.
grp = ['country', 'total_vaccinations', 'iso_code', 'vaccines']
per_vaccine_max = countryvaccination[grp].groupby('vaccines').max()
# Largest first; combinations without any reported total are left out.
vacc_no = (
    per_vaccine_max
    .sort_values('total_vaccinations', ascending=False)
    .dropna(subset=['total_vaccinations'])
)
plt.figure(figsize=(10, 7))
plt.bar(x=vacc_no.index, height=vacc_no['total_vaccinations'], color='g')
plt.title('Various categories of COVID-19 vaccines offered')
plt.xticks(rotation=90)
plt.ylabel('Number of people vaccinated')
plt.xlabel('Vaccines')
plt.show();
# Total number of vaccinations per manufacturer. Observation: Pfizer/BioNTech
# and Moderna are used the most. This dataset only covers about 10 countries,
# for example the USA, Chile and Romania.
#
# total_vaccinations is treated as a running total per (location, vaccine),
# as in the max() aggregation used for countryvaccinationbyMan1. Summing it
# over every date counted the same doses once per reporting day, so the
# latest (maximum) value per location and vaccine is taken first and only
# those values are added up per vaccine.
x = (
    manuVac
    .groupby(["location", "vaccine"])["total_vaccinations"]
    .max()
    .groupby(level="vaccine")
    .sum()
    .sort_values(ascending=False)
)
import seaborn as sns
plt.figure(figsize=(15, 10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.barplot(x=x.index, y=x.values)
ax.set_ylabel("total vaccinations")
ax.set_xlabel("Manufracturer")
ax.set_title("Share of Vaccination accoding to manufracturer ")
plt.show()
# Bar chart of the highest reported `total_vaccinations` per country for the
# United States plus nine other selected countries.
z = vacc.groupby("country")["total_vaccinations"].max().sort_values(ascending=False)
# The US value is divided by 4 so the remaining bars stay readable; the chart
# title tells the reader to multiply it back.
z['United States'] = z['United States'] / 4
countries = {'Germany', 'France', 'Italy', 'Chile', 'Romania', 'Czechia', 'Lithuania', 'Latvia', 'Iceland'}
# Pick the rows by exact country name in a single step. `Series.append` was
# removed in pandas 2.0, and substring matching could also select unrelated
# countries whose names merely contain one of these.
a = z[z.index.isin(countries | {'United States'})].sort_values(ascending=False)
plt.figure(figsize=(15, 10))
# seaborn >= 0.12 accepts the data vectors only as keyword arguments.
ax = sns.barplot(x=a.index, y=a.values)
ax.set_ylabel("total vaccinations")
ax.set_xlabel("Country")
ax.set_title("Country wise vaccination (USA normalized: Multiply by 4 for actual number)")
plt.show()
# Per-country slices of the by-manufacturer table and, for each slice, the
# summed `total_vaccinations` per vaccine (largest first).
def _vaccine_totals(country_rows):
    """Return the summed total_vaccinations per vaccine, sorted descending."""
    return country_rows.groupby("vaccine")["total_vaccinations"].sum().sort_values(ascending=False)


usaVaccName = manuVac[manuVac.location.str.contains('United States')]
gerVaccName = manuVac[manuVac.location.str.contains('Germany')]
itaVaccName = manuVac[manuVac.location.str.contains('Italy')]
litVaccName = manuVac[manuVac.location.str.contains('Lithuania')]
fraVaccName = manuVac[manuVac.location.str.contains('France')]
czeVaccName = manuVac[manuVac.location.str.contains('Czechia')]
latVaccName = manuVac[manuVac.location.str.contains('Latvia')]
chiVaccName = manuVac[manuVac.location.str.contains('Chile')]
romVaccName = manuVac[manuVac.location.str.contains('Romania')]
iceVaccName = manuVac[manuVac.location.str.contains('Iceland')]

gerVaccNames = _vaccine_totals(gerVaccName)
latVaccNames = _vaccine_totals(latVaccName)
usaVaccNames = _vaccine_totals(usaVaccName)
itaVaccNames = _vaccine_totals(itaVaccName)
fraVaccNames = _vaccine_totals(fraVaccName)
# Czechia was previously computed from the Latvia slice (copy-paste slip),
# which made its totals a duplicate of Latvia's.
czeVaccNames = _vaccine_totals(czeVaccName)
chiVaccNames = _vaccine_totals(chiVaccName)
romVaccNames = _vaccine_totals(romVaccName)
iceVaccNames = _vaccine_totals(iceVaccName)
litVaccNames = _vaccine_totals(litVaccName)
# One pie chart per country showing each vaccine's share of that country's
# summed `total_vaccinations`, arranged on a 5 x 2 grid.
# Author's observations from the rendered figure (not derivable from the code
# itself - re-check after re-running): Pfizer/BioNTech is common to these
# countries and takes a large share in them, and Chile also used a vaccine
# manufactured in China.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(
    nrows=5, ncols=2, figsize=(15, 20)
)
pie_panels = [
    (ax1, gerVaccNames, "Germany Vaccination Companies"),
    (ax2, usaVaccNames, "USA Vaccination Companies"),
    (ax3, chiVaccNames, "Chile Vaccination Companies"),
    (ax4, itaVaccNames, "Italy Vaccination Companies"),
    (ax5, fraVaccNames, "France Vaccination Companies"),
    (ax6, czeVaccNames, "Czechia Vaccination Companies"),
    (ax7, latVaccNames, "Latvia Vaccination Companies"),
    (ax8, romVaccNames, "Romania Vaccination Companies"),
    (ax9, litVaccNames, "Lithuania Vaccination Companies"),
    (ax10, iceVaccNames, "Iceland Vaccination Companies"),
]
for panel_ax, shares, panel_title in pie_panels:
    panel_ax.pie(shares, labels=shares.index, autopct='%1.2f%%')
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()
# Print, for every distinct value of the `vaccines` column (a combination of
# vaccines, not a single product), the countries reporting exactly that
# combination, followed by a separator line.
vaccc = countryvaccination1["vaccines"].unique()
for i in vaccc:
    c = countryvaccination1.loc[countryvaccination1["vaccines"] == i, 'country'].tolist()
    # The format string used to contain a stray "n" after the combination
    # (a mistyped "\n"), which was printed as part of the vaccine name.
    print(f"Vaccine names: {i}\n Used by countries: {c}\n\n")
    print('<>'*50)
Vaccine names: Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijingn Used by countries: ['Afghanistan', 'Maldives', 'Peru'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, Sputnik Vn Used by countries: ['Albania', 'Bosnia and Herzegovina'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Sputnik Vn Used by countries: ['Algeria', 'Armenia', 'Belarus', 'Guinea', 'Kazakhstan', 'Paraguay', 'Syria', 'Venezuela'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Pfizer/BioNTechn Used by countries: ['Andorra', 'Australia', 'Cayman Islands', 'Costa Rica', 'Isle of Man', 'Oman', 'Panama', 'Saudi Arabia', 'Slovenia', 'South Korea', 'Sweden'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZenecan Used by countries: ['Angola', 'Anguilla', 'Antigua and Barbuda', 'Bahamas', 'Bangladesh', 'Barbados', 'Belize', 'Bhutan', 'Botswana', 'Brunei', 'Dominica', 'Eswatini', 'Ethiopia', 'Falkland Islands', 'Fiji', 'French Polynesia', 'Gambia', 'Georgia', 'Ghana', 'Grenada', 'Guyana', 'Jamaica', 'Lesotho', 'Liberia', 'Madagascar', 'Malawi', 'Mali', 'Montserrat', 'Myanmar', 'Nauru', 'New Caledonia', 'Nigeria', 'Papua New Guinea', 'Saint Helena', 'Saint Lucia', 'Samoa', 'Sierra Leone', 'Solomon Islands', 'South Sudan', 'Sudan', 'Suriname', 'Taiwan', 'Tajikistan', 'Togo', 'Tonga', 'Trinidad and Tobago', 'Tuvalu', 'Uganda', 'Uzbekistan', 'Vietnam', 'Yemen', 'Zambia'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik Vn Used by countries: ['Argentina', 'Djibouti'] 
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Pfizer/BioNTechn Used by countries: ['Aruba', 'Bermuda', 'Gibraltar', 'Greenland', 'Japan', 'Kuwait', 'Monaco', 'New Zealand', 'Qatar', 'Slovakia'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTechn Used by countries: ['Austria', 'Belgium', 'Bulgaria', 'France', 'Germany', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Netherlands', 'Poland', 'Romania', 'Spain'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sinovacn Used by countries: ['Azerbaijan', 'Benin', 'Indonesia', 'Thailand'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sputnik Vn Used by countries: ['Bahrain', 'Bolivia', 'Lebanon', 'Moldova', 'Mongolia', 'Montenegro', 'North Macedonia', 'Serbia'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Pfizer/BioNTech, Sinovacn Used by countries: ['Brazil', 'Chile', 'Colombia', 'Ecuador', 'El Salvador', 'Malaysia', 'Ukraine', 'Uruguay'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sinopharm/Beijing, Sinovacn Used by countries: ['Cambodia', 'Dominican Republic', 'Somalia'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Sinopharm/Beijingn Used by countries: ['Cameroon', 'Equatorial Guinea', 'Gabon', 'Mauritania', 'Senegal', 'Zimbabwe'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Moderna, 
Oxford/AstraZeneca, Pfizer/BioNTechn Used by countries: ['Canada', 'Croatia', 'Estonia', 'Finland', 'Greece', 'Iceland', 'Luxembourg', 'Malta', 'Norway', 'Portugal', 'Rwanda', 'United Kingdom'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Covaxin, Oxford/AstraZenecan Used by countries: ['Central African Republic', 'India'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Sinopharm/Beijing, Sinopharm/Wuhan, Sinovacn Used by countries: ['China'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Covaxin, Oxford/AstraZeneca, Sinopharm/Beijingn Used by countries: ['Comoros', 'Mauritius'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Moderna, Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik Vn Used by countries: ['Congo'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Abdalan Used by countries: ['Cuba'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Johnson&Johnson, Oxford/AstraZeneca, Pfizer/BioNTechn Used by countries: ['Cyprus'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Moderna, Pfizer/BioNTechn Used by countries: ['Denmark', 'Faeroe Islands', 'Israel', 'Liechtenstein', 'Singapore', 'Switzerland'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sinopharm/Beijing, Sinovac, Sputnik Vn Used by countries: ['Egypt'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Moderna, Oxford/AstraZenecan Used by countries: ['Guatemala', 'Honduras'] 
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sinopharm/Beijingn Used by countries: ['Guinea-Bissau', 'Iraq', 'Morocco', 'Mozambique', 'Namibia', 'Nepal', 'Niger', 'Seychelles', 'Sri Lanka'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Pfizer/BioNTech, Sinovacn Used by countries: ['Hong Kong', 'Turkey'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Moderna, Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sputnik Vn Used by countries: ['Hungary'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Covaxin, Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik Vn Used by countries: ['Iran'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Pfizer/BioNTech, Sinopharm/Beijingn Used by countries: ['Jordan', 'Macao'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sputnik Vn Used by countries: ['Kenya', 'Nicaragua'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Sinopharm/Beijing, Sputnik Vn Used by countries: ['Kyrgyzstan', 'Laos'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Johnson&Johnson, Moderna, Oxford/AstraZeneca, Pfizer/BioNTech, Sputnik Vn Used by countries: ['Libya'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: CanSino, Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, Sputnik Vn Used by countries: ['Mexico'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine 
names: CanSino, Sinopharm/Beijing, Sinovac, Sputnik Vn Used by countries: ['Pakistan'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Sinovac, Sputnik Vn Used by countries: ['Philippines'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: EpiVacCorona, Sputnik Vn Used by countries: ['Russia'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Pfizer/BioNTech, Sputnik Vn Used by countries: ['San Marino'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Johnson&Johnsonn Used by countries: ['South Africa'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Pfizer/BioNTech, Sinovac, Sputnik Vn Used by countries: ['Tunisia'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: EpiVacCorona, Oxford/AstraZeneca, Sinopharm/Beijing, Sputnik Vn Used by countries: ['Turkmenistan'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm/Beijing, Sinopharm/Wuhan, Sputnik Vn Used by countries: ['United Arab Emirates'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> Vaccine names: Johnson&Johnson, Moderna, Pfizer/BioNTechn Used by countries: ['United States'] <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# Replace missing values with 0, then keep the maximum of every vaccination
# metric for each (country, iso_code, vaccines) group.
cv = cv.fillna(0)
# The metric columns are selected with a list: indexing a GroupBy with a bare
# tuple of column names was deprecated in pandas 1.0 and raises in pandas 2.0.
cv2 = cv.groupby(["country", 'iso_code', 'vaccines'])[[
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    "people_fully_vaccinated_per_hundred",
    'daily_vaccinations_per_million',
]].max().reset_index()
cv2.head()
| country | iso_code | vaccines | total_vaccinations | people_vaccinated | people_fully_vaccinated | daily_vaccinations | total_vaccinations_per_hundred | people_vaccinated_per_hundred | people_fully_vaccinated_per_hundred | daily_vaccinations_per_million | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm... | 504502.0 | 448878.0 | 55624.0 | 13921.0 | 1.30 | 1.15 | 0.14 | 358.0 |
| 1 | Albania | ALB | Oxford/AstraZeneca, Pfizer/BioNTech, Sinovac, ... | 632676.0 | 444755.0 | 187921.0 | 17565.0 | 21.98 | 15.45 | 6.53 | 6104.0 |
| 2 | Algeria | DZA | Sputnik V | 75000.0 | 0.0 | 0.0 | 3748.0 | 0.17 | 0.00 | 0.00 | 85.0 |
| 3 | Andorra | AND | Oxford/AstraZeneca, Pfizer/BioNTech | 31633.0 | 26931.0 | 4702.0 | 1182.0 | 40.94 | 34.86 | 6.09 | 15298.0 |
| 4 | Angola | AGO | Oxford/AstraZeneca | 626572.0 | 586377.0 | 40195.0 | 18751.0 | 1.91 | 1.78 | 0.12 | 571.0 |
# The two per-hundred metrics that serve as clustering features.
cvcluster = cv2.loc[:, ['people_fully_vaccinated_per_hundred', 'total_vaccinations_per_hundred']]
# Preview the feature table.
cvcluster.head()
| people_fully_vaccinated_per_hundred | total_vaccinations_per_hundred | |
|---|---|---|
| 0 | 0.14 | 1.30 |
| 1 | 6.53 | 21.98 |
| 2 | 0.00 | 0.17 |
| 3 | 6.09 | 40.94 |
| 4 | 0.12 | 1.91 |
# Discard rows in which both features are 0; a row stays when at least one of
# its values is non-zero.
cvcluster = cvcluster[~(cvcluster == 0.0).all(axis=1)]
# Standardise the features before clustering.
scaler = StandardScaler()
scaler.fit(cvcluster)
cvcluster1 = scaler.transform(cvcluster)
# Elbow method: fit K-Means for k = 1..9 on the scaled features and plot the
# model's inertia against k to choose the number of clusters.
distortions = []
K = range(1, 10)
for k in K:
    # A fixed seed and an explicit n_init keep the curve identical on every
    # run, independent of the scikit-learn version's default n_init.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeanModel.fit(cvcluster1)
    distortions.append(kmeanModel.inertia_)
plt.figure(figsize=(16, 8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
# Apply K-Means with three clusters to the scaled features.
# The model is seeded so the cluster ids are reproducible between runs, and
# n_init is spelled out because its default differs between scikit-learn
# releases.
# NOTE(review): cluster ids are arbitrary labels; the commentary attached to
# clusters 0 / 1 / 2 later in the notebook should be re-checked against the
# actual cluster contents after any re-run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(cvcluster1)
KMeans(n_clusters=3)
# Display the cluster id assigned to each row of the scaled feature matrix.
kmeans.labels_
array([1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 2, 1, 0, 1, 0, 1, 1, 2,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0,
1, 1, 1, 0, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
0, 0, 2, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
2, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 2,
0, 2, 0, 1, 1, 1, 0, 0, 1, 1, 1])
# Attach the cluster ids to cv2. The ids carry cvcluster's index, so any cv2
# row that was filtered out before clustering receives NaN in this column.
cluster_ids = pd.Series(data=kmeans.labels_, index=cvcluster.index)
cv2['Clustered'] = cluster_ids
# Number of rows per cluster id.
cv2.Clustered.value_counts()
1.0 136 0.0 59 2.0 14 Name: Clustered, dtype: int64
# One sub-frame per cluster id (0, 1 and 2).
cv2_0, cv2_1, cv2_2 = (
    cv2.loc[cv2['Clustered'].eq(cluster_id)] for cluster_id in (0, 1, 2)
)
# Treemaps of `total_vaccinations_per_hundred`, one per cluster. Each country
# is a tile with its `people_fully_vaccinated_per_hundred` value nested below.
# The remarks on each cluster are the author's reading of the rendered charts
# and cannot be derived from the code; re-check them after re-running.
treemap_options = {
    'values': 'total_vaccinations_per_hundred',
    'path': ['country', 'people_fully_vaccinated_per_hundred'],
    'title': "Total Vaccinations per hundred for a country",
}

# Cluster 0 - described by the author as countries with mid-range total
# vaccinations per hundred and fully-vaccinated rates, largely European
# countries with moderate populations.
fig = px.treemap(cv2_0, **treemap_options)
fig.show()

# Cluster 1 - described by the author as countries with low total vaccinations
# per hundred and low fully-vaccinated rates, mainly in Africa and Asia.
fig = px.treemap(cv2_1, **treemap_options)
fig.show()

# Cluster 2 - described by the author as countries with the highest total
# vaccinations per hundred and fully-vaccinated rates; apart from the United
# States these are said to have small populations, and the author places them
# in the Middle East (a region, not a continent) - verify against the chart.
fig = px.treemap(cv2_2, **treemap_options)
fig.show()
# Load the country-level COVID data and keep only rows whose iso_code is 'USA'.
# NOTE(review): this was originally described as the Worldometer dataset, but
# the file name suggests Our World in Data (OWID) - confirm the source.
USA = pd.read_csv("owid-covid-data.csv").loc[lambda frame: frame['iso_code'] == 'USA']
# Load the US daily COVID file used for the previous task and preview it.
USAold = pd.read_csv("2021-04-21_covid19_daily.csv")
USAold.head()
| date | states | positive | negative | pending | hospitalizedCurrently | hospitalizedCumulative | inIcuCurrently | inIcuCumulative | onVentilatorCurrently | ... | totalTestResults | lastModified | total | posNeg | deathIncrease | hospitalizedIncrease | negativeIncrease | positiveIncrease | totalTestResultsIncrease | hash | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20201206 | 56 | 14534035 | 161986294 | 13592.0 | 101487.0 | 585676.0 | 20145.0 | 31946.0 | 7094.0 | ... | 204063869 | 2020-12-06T24:00:00Z | 0 | 0 | 1138 | 2256 | 1172590 | 176771 | 1634532 | 9cf16504f91958e803a2197daf8c2528a4eddc18 |
| 1 | 20201205 | 56 | 14357264 | 160813704 | 13433.0 | 101190.0 | 583420.0 | 19950.0 | 31831.0 | 7005.0 | ... | 202429337 | 2020-12-05T24:00:00Z | 0 | 0 | 2445 | 3316 | 1526995 | 211073 | 2169756 | 6249216c5f097c94ce33a811dab011a483a42404 |
| 2 | 20201204 | 56 | 14146191 | 159286709 | 12714.0 | 101276.0 | 580104.0 | 19858.0 | 31608.0 | 6999.0 | ... | 200259581 | 2020-12-04T24:00:00Z | 0 | 0 | 2563 | 4652 | 1260657 | 224831 | 1854869 | ae30ea088584335ba4d57ee927f8dbda6add74db |
| 3 | 20201203 | 56 | 13921360 | 158026052 | 15106.0 | 100755.0 | 575452.0 | 19723.0 | 31276.0 | 6867.0 | ... | 198404712 | 2020-12-03T24:00:00Z | 0 | 0 | 2706 | 5331 | 1238465 | 210204 | 1828230 | 0f253d185ecb336cdd18a4c61996eda1b7eef13b |
| 4 | 20201202 | 56 | 13711156 | 156787587 | 14368.0 | 100322.0 | 570121.0 | 19680.0 | 31038.0 | 6855.0 | ... | 196576482 | 2020-12-02T24:00:00Z | 0 | 0 | 2733 | 5028 | 982032 | 195796 | 1459202 | 477c17b6302d0485195e77ecf8270a974f7a3c82 |
5 rows × 25 columns
# Reduce the older US daily file to date / new_cases / new_deaths, with the
# daily increase columns renamed to match the newer dataset.
USAold = USAold[['date', 'positiveIncrease', 'deathIncrease']].rename(
    columns={'positiveIncrease': 'new_cases', 'deathIncrease': 'new_deaths'}
)
# The date column holds compact values such as 20201206, so the format has no
# separators. '%Y/%m/%d' only parsed on older pandas, which silently fell back
# to its ISO parser; pandas >= 2.0 enforces the format and raises.
USAold['date'] = pd.to_datetime(USAold['date'].astype(str), format='%Y%m%d')
USAold = USAold.sort_values(by='date')
# Restrict the USA rows to the date window 2020-12-07 .. 2021-05-13 (inclusive).
start_date = '2020-12-07'
end_date = '2021-05-13'
# assign() returns a new frame, so the date conversion does not write into a
# filtered slice of the loaded table.
USA = USA.assign(date=pd.to_datetime(USA['date']))
# Build one inclusive mask on the frame being filtered. The second filter used
# to index USAnew with a mask computed on USA, which only worked because
# pandas re-aligned the mask by index.
USAnew = USA.loc[USA['date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))]
USAnew.head()
| iso_code | continent | location | date | total_cases | new_cases | new_cases_smoothed | total_deaths | new_deaths | new_deaths_smoothed | ... | gdp_per_capita | extreme_poverty | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | handwashing_facilities | hospital_beds_per_thousand | life_expectancy | human_development_index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 87830 | USA | North America | United States | 2020-12-07 | 15108918.0 | 194858.0 | 205512.286 | 287342.0 | 1606.0 | 2331.714 | ... | 54225.446 | 1.2 | 151.089 | 10.79 | 19.1 | 24.6 | NaN | 2.77 | 78.86 | 0.926 |
| 87831 | USA | North America | United States | 2020-12-08 | 15333410.0 | 224492.0 | 210694.143 | 289963.0 | 2621.0 | 2342.429 | ... | 54225.446 | 1.2 | 151.089 | 10.79 | 19.1 | 24.6 | NaN | 2.77 | 78.86 | 0.926 |
| 87832 | USA | North America | United States | 2020-12-09 | 15555949.0 | 222539.0 | 213548.714 | 293146.0 | 3183.0 | 2395.000 | ... | 54225.446 | 1.2 | 151.089 | 10.79 | 19.1 | 24.6 | NaN | 2.77 | 78.86 | 0.926 |
| 87833 | USA | North America | United States | 2020-12-10 | 15787464.0 | 231515.0 | 214677.571 | 296141.0 | 2995.0 | 2400.571 | ... | 54225.446 | 1.2 | 151.089 | 10.79 | 19.1 | 24.6 | NaN | 2.77 | 78.86 | 0.926 |
| 87834 | USA | North America | United States | 2020-12-11 | 16027441.0 | 239977.0 | 215705.000 | 299559.0 | 3418.0 | 2503.286 | ... | 54225.446 | 1.2 | 151.089 | 10.79 | 19.1 | 24.6 | NaN | 2.77 | 78.86 | 0.926 |
5 rows × 59 columns
# Keep only the date and the daily case / death counts of the newer USA data
# (the rows from December 2020 to May 2021), then preview it.
USAnew = USAnew.loc[:, ['date', 'new_cases', 'new_deaths']]
USAnew.head()
| date | new_cases | new_deaths | |
|---|---|---|---|
| 87830 | 2020-12-07 | 194858.0 | 1606.0 |
| 87831 | 2020-12-08 | 224492.0 | 2621.0 |
| 87832 | 2020-12-09 | 222539.0 | 3183.0 |
| 87833 | 2020-12-10 | 231515.0 | 2995.0 |
| 87834 | 2020-12-11 | 239977.0 | 3418.0 |
# Stack the older daily series on top of the newer one. The preview therefore
# shows the first rows of the older data, which start in January 2020.
frames = [USAold, USAnew]
USA1 = pd.concat(objs=frames, axis=0)
USA1.head()
| date | new_cases | new_deaths | |
|---|---|---|---|
| 319 | 2020-01-22 | 0.0 | 0.0 |
| 318 | 2020-01-23 | 0.0 | 0.0 |
| 317 | 2020-01-24 | 0.0 | 0.0 |
| 316 | 2020-01-25 | 0.0 | 0.0 |
| 315 | 2020-01-26 | 0.0 | 0.0 |
# Combine the daily case / death series with the US daily vaccination counts.
# `UnitedStates` is defined earlier in the notebook; .copy() makes the date
# conversion below act on an independent frame rather than on a slice.
UnitedStatesnew = UnitedStates[['date', 'daily_vaccinations']].copy()
USA1.reset_index(drop=True, inplace=True)
UnitedStatesnew.reset_index(drop=True, inplace=True)
# pandas >= 2.0 applies `format` strictly, so it has to use the separators that
# are actually present. The vaccination dates are parsed with '%Y-%m-%d' at
# the top of the notebook; '%Y/%m/%d' only worked on older pandas.
# NOTE(review): assumes UnitedStates['date'] holds ISO 'YYYY-MM-DD' values like
# the source CSV - confirm.
UnitedStatesnew['date'] = pd.to_datetime(UnitedStatesnew['date'].astype(str), format='%Y-%m-%d')
sub_df = UnitedStatesnew[["date", "daily_vaccinations"]]
# Outer join on the shared date column keeps days that appear in only one of
# the two tables; the resulting gaps are filled with 0.
US = USA1.merge(sub_df, how="outer", on="date")
US = US.fillna(0)
# Final USA dataset: daily cases and deaths merged with daily vaccinations.
US
| date | new_cases | new_deaths | daily_vaccinations | |
|---|---|---|---|---|
| 0 | 2020-01-22 | 0.0 | 0.0 | 0.0 |
| 1 | 2020-01-23 | 0.0 | 0.0 | 0.0 |
| 2 | 2020-01-24 | 0.0 | 0.0 | 0.0 |
| 3 | 2020-01-25 | 0.0 | 0.0 | 0.0 |
| 4 | 2020-01-26 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... |
| 473 | 2021-05-09 | 21392.0 | 242.0 | 2017931.0 |
| 474 | 2021-05-10 | 36898.0 | 401.0 | 2117025.0 |
| 475 | 2021-05-11 | 33651.0 | 677.0 | 2194787.0 |
| 476 | 2021-05-12 | 35878.0 | 851.0 | 2159146.0 |
| 477 | 2021-05-13 | 38087.0 | 806.0 | 2088962.0 |
478 rows × 4 columns
# Daily confirmed cases (left axis) against daily vaccinations (right axis)
# for the USA, drawn on a shared date axis.
# Author's observation from the rendered chart (not derivable from the code):
# daily cases fall while the vaccination rate rises.
from plotly.offline import iplot

new_cases = go.Scatter(
    name='Daily confirmed cases',
    x=US['date'],
    y=US['new_cases'],
    yaxis='y1',
)
daily_vaccinations = go.Scatter(
    name='Daily vacctinations',
    x=US['date'],
    y=US['daily_vaccinations'],
    yaxis='y2',
)
layout_obj = go.Layout(
    title='USA COVID vs. USA Vaccinations',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Daily cases'},
    # Second y-axis drawn on the right-hand side, overlaid on the first.
    yaxis2={'title': 'Vacctinations', 'side': 'right', 'overlaying': 'y'},
)
fig = go.Figure(data=[new_cases, daily_vaccinations], layout=layout_obj)
iplot(fig)
# Daily deaths (left axis) against daily vaccinations (right axis) for the
# USA, drawn on a shared date axis.
# Author's observation from the rendered chart (not derivable from the code):
# daily deaths fall while the vaccination rate rises.
from plotly.offline import iplot

daily_deaths = go.Scatter(
    name='Daily deaths',
    x=US['date'],
    y=US['new_deaths'],
    yaxis='y1',
)
daily_vaccinations = go.Scatter(
    name='Daily vacctinations',
    x=US['date'],
    y=US['daily_vaccinations'],
    yaxis='y2',
)
layout_obj = go.Layout(
    title='USA COVID vs. USA Vaccinations',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Daily deaths'},
    # Second y-axis drawn on the right-hand side, overlaid on the first.
    yaxis2={'title': 'Vacctinations', 'side': 'right', 'overlaying': 'y'},
)
fig = go.Figure(data=[daily_deaths, daily_vaccinations], layout=layout_obj)
iplot(fig)